{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "85347289",
   "metadata": {},
   "source": [
    "# Chapter 3 - Text Matching"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8dca499a",
   "metadata": {},
   "source": [
    "## Text Matching Techniques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57030964",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install jellyfish\n",
    "import jellyfish as jf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97782486",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Levenshtein (edit) distance between the two spellings.\n",
    "jf.levenshtein_distance('Michael', 'Micheal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "294e778a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smallest pairwise edit distance among the three fragment pairs.\n",
    "min(\n",
    "    jf.levenshtein_distance(left, right)\n",
    "    for left, right in [('el', 'ael'), ('ael', 'al'), ('el', 'al')]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5ab5b2c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Jaro similarity of the same pair of spellings.\n",
    "jf.jaro_similarity('Michael', 'Micheal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73b6035b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Jaro-Winkler similarity of the same pair of spellings.\n",
    "jf.jaro_winkler_similarity('Michael', 'Micheal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f78d57bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same name in different letter case, compared without any normalisation.\n",
    "jf.jaro_winkler_similarity('michael', 'MICHAEL')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34e1a7bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case both strings before comparing them.\n",
    "jf.jaro_winkler_similarity('michael'.lower(), 'MICHAEL'.lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c509f40d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metaphone code for 'michael'.\n",
    "jf.metaphone('michael')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8dc55e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metaphone code for the alternative spelling 'micheal'.\n",
    "jf.metaphone('micheal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c882782",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11c52432",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name variants compared pairwise in the next cell.\n",
    "mylist = [\n",
    "    'Michael',\n",
    "    'Micheal',\n",
    "    'Michel',\n",
    "    'Mike',\n",
    "    'Mick',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab2f2a53",
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "# One row per unordered pair of names, holding every comparison measure.\n",
    "combs = []\n",
    "for a, b in itertools.combinations(mylist, 2):\n",
    "    combs.append([\n",
    "        a,\n",
    "        b,\n",
    "        jf.jaro_similarity(a, b),\n",
    "        jf.jaro_winkler_similarity(a, b),\n",
    "        jf.levenshtein_distance(a, b),\n",
    "        jf.match_rating_comparison(a, b),\n",
    "        jf.metaphone(a) == jf.metaphone(b),\n",
    "    ])\n",
    "\n",
    "pd.DataFrame(\n",
    "    combs,\n",
    "    columns=['Name1', 'Name2', 'Jaro', 'JaroW', 'Levenshtein', 'Match Rating', 'Metaphone'],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84d1b1e6",
   "metadata": {},
   "source": [
    "## Sample Problem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c511d5e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load mps_unmatched.csv (read relative to the working directory) and display it.\n",
    "df_w_un = pd.read_csv('mps_unmatched.csv')\n",
    "df_w_un"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9b7cdf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row-wise Jaro-Winkler similarity between the _w and _t name columns.\n",
    "df_w_un['Firstname_jaro'] = df_w_un.apply(\n",
    "    lambda row: jf.jaro_winkler_similarity(row['Firstname_w'], row['Firstname_t']),\n",
    "    axis=1,\n",
    ")\n",
    "df_w_un['Lastname_jaro'] = df_w_un.apply(\n",
    "    lambda row: jf.jaro_winkler_similarity(row['Lastname_w'], row['Lastname_t']),\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09ad6064",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows where both name similarities exceed 0.8.\n",
    "df_w_un[df_w_un['Firstname_jaro'].gt(0.8) & df_w_un['Lastname_jaro'].gt(0.8)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc59c5dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_w = pd.read_csv('mps_wiki_clean.csv')\n",
    "df_t = pd.read_csv('mps_they_clean.csv')\n",
    "\n",
    "# Cartesian product of the two frames; overlapping column names get _w / _t suffixes.\n",
    "cross = pd.merge(df_w, df_t, how='cross', suffixes=('_w', '_t'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c982dc8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first five candidate pairs.\n",
    "cross.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2104a348",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag each candidate pair whose Jaro-Winkler similarity exceeds 0.8.\n",
    "# The comparison already yields a bool, so no `True if ... else False` wrapper is needed.\n",
    "cross['Firstname_jaro'] = cross.apply(\n",
    "    lambda row: jf.jaro_winkler_similarity(row.Firstname_w, row.Firstname_t) > 0.8,\n",
    "    axis=1,\n",
    ")\n",
    "cross['Lastname_jaro'] = cross.apply(\n",
    "    lambda row: jf.jaro_winkler_similarity(row.Lastname_w, row.Lastname_t) > 0.8,\n",
    "    axis=1,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91c1998b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first five rows, now including the two flag columns.\n",
    "cross.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96ce884d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairs where both name flags are set and the constituencies are equal.\n",
    "tp = cross[\n",
    "    cross['Firstname_jaro']\n",
    "    & cross['Lastname_jaro']\n",
    "    & cross['Constituency_w'].eq(cross['Constituency_t'])\n",
    "]\n",
    "len(tp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93cb7a61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairs where both name flags are set but the constituencies differ.\n",
    "fp = cross[\n",
    "    cross['Firstname_jaro']\n",
    "    & cross['Lastname_jaro']\n",
    "    & cross['Constituency_w'].ne(cross['Constituency_t'])\n",
    "]\n",
    "len(fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4c0783a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first five rows of fp.\n",
    "fp.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6245c17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairs with equal constituencies where at least one of the name flags is not set.\n",
    "fntn = cross[\n",
    "    ~(cross['Firstname_jaro'] & cross['Lastname_jaro'])\n",
    "    & cross['Constituency_w'].eq(cross['Constituency_t'])\n",
    "]\n",
    "len(fntn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f37fc70",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display every row of fntn for inspection.\n",
    "fntn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64ee802b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average number of rows per distinct first name.\n",
    "df_w.groupby('Firstname').size().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a4ea4fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average number of rows per distinct last name.\n",
    "df_w.groupby('Lastname').size().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88395666",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average number of rows per distinct constituency.\n",
    "df_w.groupby('Constituency').size().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9938f2c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metaphone code of each Firstname_w value.\n",
    "# Series.map works column-wise, so no per-row Series is built just to read one field.\n",
    "df_w_un['Firstname_w_meta'] = df_w_un['Firstname_w'].map(jf.metaphone)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b98ff150",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display df_w_un, which now includes the Firstname_w_meta column.\n",
    "df_w_un"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94972568",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metaphone code of each Firstname_t value.\n",
    "# Series.map works column-wise, so no per-row Series is built just to read one field.\n",
    "df_w_un['Firstname_t_meta'] = df_w_un['Firstname_t'].map(jf.metaphone)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebcc2165",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display df_w_un, which now includes the Firstname_t_meta column.\n",
    "df_w_un"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe6ab53f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Metaphone codes of both last-name columns.\n",
    "# Series.map works column-wise, so no per-row Series is built just to read one field.\n",
    "df_w_un['Lastname_w_meta'] = df_w_un['Lastname_w'].map(jf.metaphone)\n",
    "df_w_un['Lastname_t_meta'] = df_w_un['Lastname_t'].map(jf.metaphone)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54e2cba6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display df_w_un, which now includes the Lastname_w_meta and Lastname_t_meta columns.\n",
    "df_w_un"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "handsonentityresolution",
   "language": "python",
   "name": "handsonentityresolution"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
